In [1]:
import pandas as pd
import os
In [2]:
!pip install category_encoders==2.*
Requirement already satisfied: category_encoders==2.* in c:\users\j8015\anaconda3\lib\site-packages (2.1.0)
Requirement already satisfied: scipy>=0.19.0 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (1.4.1)
Requirement already satisfied: patsy>=0.4.1 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (0.5.1)
Requirement already satisfied: statsmodels>=0.6.1 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (0.11.1)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (0.22.1)
Requirement already satisfied: numpy>=1.11.3 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (1.18.1)
Requirement already satisfied: pandas>=0.21.1 in c:\users\j8015\anaconda3\lib\site-packages (from category_encoders==2.*) (1.0.1)
Requirement already satisfied: six in c:\users\j8015\anaconda3\lib\site-packages (from patsy>=0.4.1->category_encoders==2.*) (1.14.0)
Requirement already satisfied: joblib>=0.11 in c:\users\j8015\anaconda3\lib\site-packages (from scikit-learn>=0.20.0->category_encoders==2.*) (0.14.1)
Requirement already satisfied: pytz>=2017.2 in c:\users\j8015\anaconda3\lib\site-packages (from pandas>=0.21.1->category_encoders==2.*) (2019.3)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas>=0.21.1->category_encoders==2.*) (2.8.1)
In [4]:
!pip install pandas-profiling==2.*
Requirement already satisfied: pandas-profiling==2.* in c:\users\j8015\anaconda3\lib\site-packages (2.6.0)
Requirement already satisfied: requests>=2.23.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (2.23.0)
Requirement already satisfied: scipy>=1.4.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (1.4.1)
Requirement already satisfied: statsmodels>=0.11.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.11.1)
Requirement already satisfied: phik>=0.9.10 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.9.11)
Requirement already satisfied: confuse>=1.0.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (1.1.0)
Requirement already satisfied: htmlmin>=0.1.12 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.1.12)
Requirement already satisfied: matplotlib>=3.2.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (3.2.1)
Requirement already satisfied: astropy>=4.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (4.0)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.0.4)
Requirement already satisfied: visions[type_image_path]>=0.4.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.4.1)
Requirement already satisfied: missingno>=0.4.2 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (0.4.2)
Requirement already satisfied: tqdm>=4.43.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (4.45.0)
Requirement already satisfied: ipywidgets>=7.5.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (7.5.1)
Requirement already satisfied: pandas>=0.25.3 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (1.0.1)
Requirement already satisfied: numpy>=1.16.0 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (1.18.1)
Requirement already satisfied: jinja2>=2.11.1 in c:\users\j8015\anaconda3\lib\site-packages (from pandas-profiling==2.*) (2.11.1)
Requirement already satisfied: idna<3,>=2.5 in c:\users\j8015\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.*) (2.8)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\j8015\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.*) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\j8015\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.*) (2019.11.28)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\j8015\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.*) (1.25.8)
Requirement already satisfied: patsy>=0.5 in c:\users\j8015\anaconda3\lib\site-packages (from statsmodels>=0.11.1->pandas-profiling==2.*) (0.5.1)
Requirement already satisfied: joblib>=0.14.1 in c:\users\j8015\anaconda3\lib\site-packages (from phik>=0.9.10->pandas-profiling==2.*) (0.14.1)
Requirement already satisfied: numba>=0.38.1 in c:\users\j8015\anaconda3\lib\site-packages (from phik>=0.9.10->pandas-profiling==2.*) (0.48.0)
Requirement already satisfied: pyyaml in c:\users\j8015\anaconda3\lib\site-packages (from confuse>=1.0.0->pandas-profiling==2.*) (5.3)
Requirement already satisfied: cycler>=0.10 in c:\users\j8015\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.*) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\j8015\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.*) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\users\j8015\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.*) (2.4.6)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\j8015\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.*) (1.1.0)
Requirement already satisfied: networkx>=2.4 in c:\users\j8015\anaconda3\lib\site-packages (from visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (2.4)
Requirement already satisfied: attrs>=19.3.0 in c:\users\j8015\anaconda3\lib\site-packages (from visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (19.3.0)
Requirement already satisfied: imagehash; extra == "type_image_path" in c:\users\j8015\anaconda3\lib\site-packages (from visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (4.0)
Requirement already satisfied: Pillow; extra == "type_image_path" in c:\users\j8015\anaconda3\lib\site-packages (from visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (7.0.0)
Requirement already satisfied: seaborn in c:\users\j8015\anaconda3\lib\site-packages (from missingno>=0.4.2->pandas-profiling==2.*) (0.10.0)
Requirement already satisfied: traitlets>=4.3.1 in c:\users\j8015\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.*) (4.3.3)
Requirement already satisfied: widgetsnbextension~=3.5.0 in c:\users\j8015\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.*) (3.5.1)
Requirement already satisfied: ipython>=4.0.0; python_version >= "3.3" in c:\users\j8015\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.*) (7.12.0)
Requirement already satisfied: nbformat>=4.2.0 in c:\users\j8015\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.*) (5.0.4)
Requirement already satisfied: ipykernel>=4.5.1 in c:\users\j8015\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.*) (5.1.4)
Requirement already satisfied: pytz>=2017.2 in c:\users\j8015\anaconda3\lib\site-packages (from pandas>=0.25.3->pandas-profiling==2.*) (2019.3)
Requirement already satisfied: MarkupSafe>=0.23 in c:\users\j8015\anaconda3\lib\site-packages (from jinja2>=2.11.1->pandas-profiling==2.*) (1.1.1)
Requirement already satisfied: six in c:\users\j8015\anaconda3\lib\site-packages (from patsy>=0.5->statsmodels>=0.11.1->pandas-profiling==2.*) (1.14.0)
Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in c:\users\j8015\anaconda3\lib\site-packages (from numba>=0.38.1->phik>=0.9.10->pandas-profiling==2.*) (0.31.0)
Requirement already satisfied: setuptools in c:\users\j8015\anaconda3\lib\site-packages (from numba>=0.38.1->phik>=0.9.10->pandas-profiling==2.*) (45.2.0.post20200210)
Requirement already satisfied: decorator>=4.3.0 in c:\users\j8015\anaconda3\lib\site-packages (from networkx>=2.4->visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (4.4.1)
Requirement already satisfied: pywavelets in c:\users\j8015\anaconda3\lib\site-packages (from imagehash; extra == "type_image_path"->visions[type_image_path]>=0.4.1->pandas-profiling==2.*) (1.1.1)
Requirement already satisfied: ipython-genutils in c:\users\j8015\anaconda3\lib\site-packages (from traitlets>=4.3.1->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.2.0)
Requirement already satisfied: notebook>=4.4.1 in c:\users\j8015\anaconda3\lib\site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (6.0.3)
Requirement already satisfied: pickleshare in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.7.5)
Requirement already satisfied: pygments in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (2.5.2)
Requirement already satisfied: jedi>=0.10 in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.14.1)
Requirement already satisfied: colorama; sys_platform == "win32" in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.4.3)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (3.0.3)
Requirement already satisfied: backcall in c:\users\j8015\anaconda3\lib\site-packages (from ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.1.0)
Requirement already satisfied: jupyter-core in c:\users\j8015\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (4.6.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in c:\users\j8015\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (3.2.0)
Requirement already satisfied: tornado>=4.2 in c:\users\j8015\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.*) (6.0.3)
Requirement already satisfied: jupyter-client in c:\users\j8015\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.*) (5.3.4)
Requirement already satisfied: prometheus-client in c:\users\j8015\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.7.1)
Requirement already satisfied: Send2Trash in c:\users\j8015\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (1.5.0)
Requirement already satisfied: terminado>=0.8.1 in c:\users\j8015\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.8.3)
Requirement already satisfied: nbconvert in c:\users\j8015\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (5.6.1)
Requirement already satisfied: pyzmq>=17 in c:\users\j8015\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (18.1.1)
Requirement already satisfied: parso>=0.5.0 in c:\users\j8015\anaconda3\lib\site-packages (from jedi>=0.10->ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.5.2)
Requirement already satisfied: wcwidth in c:\users\j8015\anaconda3\lib\site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0; python_version >= "3.3"->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.1.8)
Requirement already satisfied: pywin32>=1.0; sys_platform == "win32" in c:\users\j8015\anaconda3\lib\site-packages (from jupyter-core->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (227)
Requirement already satisfied: importlib-metadata; python_version < "3.8" in c:\users\j8015\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (1.5.0)
Requirement already satisfied: pyrsistent>=0.14.0 in c:\users\j8015\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.15.7)
Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.8.4)
Requirement already satisfied: entrypoints>=0.2.2 in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.3)
Requirement already satisfied: bleach in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (3.1.0)
Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (1.4.2)
Requirement already satisfied: testpath in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.4.4)
Requirement already satisfied: defusedxml in c:\users\j8015\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.6.0)
Requirement already satisfied: zipp>=0.5 in c:\users\j8015\anaconda3\lib\site-packages (from importlib-metadata; python_version < "3.8"->jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (2.2.0)
Requirement already satisfied: webencodings in c:\users\j8015\anaconda3\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.*) (0.5.1)
In [3]:
# Read sample_submission.csv, print its (rows, columns) and show the last two rows.
sample_submission = pd.read_csv('sample_submission.csv')
print((len(sample_submission.index), len(sample_submission.columns)))
sample_submission.iloc[-2:]
(60980, 29)
Out[3]:
id F1 F2 F3 F4 F5 F6 F7 F8 F9 ... F19 F20 F21 F22 F23 F24 F25 F26 F27 F28
60978 FOODS_3_826_WI_3_evaluation 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
60979 FOODS_3_827_WI_3_evaluation 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 29 columns

In [5]:
# Read sales_train_validation.csv: one row per id, with the daily columns
# d_1 ... d_1913 laid out side by side (wide format).
sales_train = pd.read_csv(filepath_or_buffer='sales_train_validation.csv')
print(sales_train.shape)
sales_train.head(5)
(30490, 1919)
Out[5]:
id item_id dept_id cat_id store_id state_id d_1 d_2 d_3 d_4 ... d_1904 d_1905 d_1906 d_1907 d_1908 d_1909 d_1910 d_1911 d_1912 d_1913
0 HOBBIES_1_001_CA_1_validation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 1 3 0 1 1 1 3 0 1 1
1 HOBBIES_1_002_CA_1_validation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
2 HOBBIES_1_003_CA_1_validation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 2 1 2 1 1 1 0 1 1 1
3 HOBBIES_1_004_CA_1_validation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 1 0 5 4 1 0 1 3 7 2
4 HOBBIES_1_005_CA_1_validation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 2 1 1 0 1 1 2 2 2 4

5 rows × 1919 columns

In [10]:
# calendar.csv describes the dates on which the products are sold: one row per
# date, with its week id (wm_yr_wk), day label (d), events and SNAP flags.

calendar = pd.read_csv(filepath_or_buffer='calendar.csv')
print((len(calendar.index), len(calendar.columns)))
calendar.iloc[:5]
(1969, 14)
Out[10]:
date wm_yr_wk weekday wday month year d event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 2011-01-29 11101 Saturday 1 1 2011 d_1 NaN NaN NaN NaN 0 0 0
1 2011-01-30 11101 Sunday 2 1 2011 d_2 NaN NaN NaN NaN 0 0 0
2 2011-01-31 11101 Monday 3 1 2011 d_3 NaN NaN NaN NaN 0 0 0
3 2011-02-01 11101 Tuesday 4 2 2011 d_4 NaN NaN NaN NaN 1 1 0
4 2011-02-02 11101 Wednesday 5 2 2011 d_5 NaN NaN NaN NaN 1 0 1
In [6]:
# sell_prices.csv holds the price of each item per store and week (wm_yr_wk).

sell_prices = pd.read_csv(filepath_or_buffer='sell_prices.csv')
print(sell_prices.shape)
sell_prices.head(n=5)
(6841121, 4)
Out[6]:
store_id item_id wm_yr_wk sell_price
0 CA_1 HOBBIES_1_001 11325 9.58
1 CA_1 HOBBIES_1_001 11326 9.58
2 CA_1 HOBBIES_1_001 11327 8.26
3 CA_1 HOBBIES_1_001 11328 8.26
4 CA_1 HOBBIES_1_001 11329 8.26
In [7]:
# Randomly sample 5% of the "sell_prices" dataframe (frac=0.05).
# random_state=42 fixes the seed so the same rows are drawn on every run.

sell_sample = sell_prices.sample(frac=0.05, random_state=42)
sell_sample.shape
Out[7]:
(342056, 4)
In [11]:
# Join the sampled prices to the calendar on the week id `wm_yr_wk`.
# calendar has one row per date, so every weekly price row is repeated once for
# each calendar day of that week (the row count grows accordingly).

sell_calendar = sell_sample.merge(calendar, on=['wm_yr_wk'])
print(sell_calendar.shape)
sell_calendar.head(n=2)
(2386857, 17)
Out[11]:
store_id item_id wm_yr_wk sell_price date weekday wday month year d event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 TX_3 FOODS_3_295 11448 0.8 2014-12-27 Saturday 1 12 2014 d_1429 NaN NaN NaN NaN 0 0 0
1 TX_3 FOODS_3_295 11448 0.8 2014-12-28 Sunday 2 12 2014 d_1430 NaN NaN NaN NaN 0 0 0
In [12]:
# Add an "id" column to `sell_calendar`, built as <item_id>_<store_id>_validation
# (the same layout as the `id` values in sales_train, e.g. HOBBIES_1_001_CA_1_validation).

item_store_key = sell_calendar['item_id'] + '_' + sell_calendar['store_id']
sell_calendar['id'] = item_store_key + '_validation'
sell_calendar.shape
Out[12]:
(2386857, 18)
In [13]:
# Keep the columns from 'id' through 'd_100' of sales_train. Label slices with
# .loc include both end points, so 'd_100' itself is part of the result.
id_to_d100 = sales_train.loc[:, 'id':'d_100']
sale_train = pd.DataFrame(data=id_to_d100)
sale_train.shape
Out[13]:
(30490, 106)
In [25]:
# Join "sell_calendar" with the trimmed sales frame "sale_train" on `id`.
# NOTE: item_id and store_id exist in both inputs and are not join keys, so
# pandas keeps both copies with _x / _y suffixes (e.g. store_id_x, item_id_x).

sales = sell_calendar.merge(sale_train, on=['id'])
print(sales.shape)
sales.head(n=2)
(2386857, 123)
Out[25]:
store_id_x item_id_x wm_yr_wk sell_price date weekday wday month year d ... d_91 d_92 d_93 d_94 d_95 d_96 d_97 d_98 d_99 d_100
0 TX_3 FOODS_3_295 11448 0.8 2014-12-27 Saturday 1 12 2014 d_1429 ... 0 0 0 0 0 0 0 0 0 0
1 TX_3 FOODS_3_295 11448 0.8 2014-12-28 Sunday 2 12 2014 d_1430 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 123 columns

In [29]:
import csv
In [ ]:
# Persist the merged frame to disk. Calling `to_csv()` without a path writes
# nothing: it builds the entire CSV as one in-memory string and returns it,
# which for a frame of this size is both useless and very expensive.
# The file name is chosen to match the variable name; adjust if another
# location is expected. index=False leaves out the positional row index.
sales.to_csv('sales.csv', index=False)
In [17]:
# Number of distinct (non-null) values in every column of the merged frame.
sales.nunique(axis=0, dropna=True)
Out[17]:
id              30425
item_id_x        3049
dept_id             7
cat_id              3
store_id_x         10
                ...  
event_name_2        4
event_type_2        2
snap_CA             2
snap_TX             2
snap_WI             2
Length: 123, dtype: int64
In [20]:
from pandas_profiling import ProfileReport

# Build the report object first and keep a reference to it. `to_notebook_iframe()`
# only displays the report and returns None, so chaining it onto the constructor
# would bind `profile` to None and lose the report.
# minimal=True switches off the most expensive computations, which matters for a
# frame of this size.
profile = ProfileReport(sales, minimal=True)

# Render the report inline in the notebook.
profile.to_notebook_iframe()





C:\Users\J8015\anaconda3\lib\site-packages\IPython\core\display.py:701: UserWarning: Consider using IPython.display.IFrame instead
  warnings.warn("Consider using IPython.display.IFrame instead")
In [ ]: